Project Description

In this workspace, we scrape the novel Moby Dick from Project Gutenberg, a website that offers a large collection of free books. We use the Python requests package to download the HTML of the novel and the BeautifulSoup library to extract its plain text.

We then split the text into words with the Natural Language Toolkit (nltk) and analyze their frequencies with the Counter class from Python's collections module. This tells us how often different words appear in the novel.

The data science pipeline we build here can be applied to any novel available on Project Gutenberg, so the same steps let us visualize and study the word frequency patterns of many different books; a reusable version of the pipeline is sketched below.
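Because none of the steps below depend on anything specific to Moby Dick, they can be collected into one function. This is a minimal sketch, not part of the original project: the name top_words and the default of ten results are our own choices, and it assumes the imports from the first code cell.

def top_words(url, n=10):
    """Return the n most common non-stop words on a Gutenberg HTML page."""
    r = requests.get(url)
    r.encoding = 'utf-8'  # Gutenberg serves UTF-8
    text = BeautifulSoup(r.text, "html.parser").get_text()
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
    words = [t.lower() for t in tokens]
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return Counter(w for w in words if w not in stop_words).most_common(n)

Calling top_words with the Moby Dick URL used below would reproduce the top-ten list computed at the end of this notebook.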

What are the most frequent words in Herman Melville's novel Moby Dick, and how often do they occur?

In [9]:
# Import packages and download the nltk stop word list
import requests
from bs4 import BeautifulSoup
import nltk
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\newbe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [2]:
# Get the Moby Dick HTML  
r = requests.get('https://s3.amazonaws.com/assets.datacamp.com/production/project_147/datasets/2701-h.htm')

# Set the correct text encoding of the HTML page
r.encoding = 'utf-8'

# Extract the HTML from the request object
html = r.text

# Print the first 2000 characters in html
print(html[0:2000])
<?xml version="1.0" encoding="utf-8"?>

<!DOCTYPE html
   PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" >

<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
  <head>
    <title>
      Moby Dick; Or the Whale, by Herman Melville
    </title>
    <style type="text/css" xml:space="preserve">

    body { background:#faebd0; color:black; margin-left:15%; margin-right:15%; text-align:justify }
    P { text-indent: 1em; margin-top: .25em; margin-bottom: .25em; }
    H1,H2,H3,H4,H5,H6 { text-align: center; margin-left: 15%; margin-right: 15%; }
    hr  { width: 50%; text-align: center;}
    .foot { margin-left: 20%; margin-right: 20%; text-align: justify; text-indent: -3em; font-size: 90%; }
    blockquote {font-size: 100%; margin-left: 0%; margin-right: 0%;}
    .mynote    {background-color: #DDE; color: #000; padding: .5em; margin-left: 10%; margin-right: 10%; font-family: sans-serif; font-size: 95%;}
    .toc       { margin-left: 10%; margin-bottom: .75em;}
    .toc2      { margin-left: 20%;}
    div.fig    { display:block; margin:0 auto; text-align:center; }
    div.middle { margin-left: 20%; margin-right: 20%; text-align: justify; }
    .figleft   {float: left; margin-left: 0%; margin-right: 1%;}
    .figright  {float: right; margin-right: 0%; margin-left: 1%;}
    .pagenum   {display:inline; font-size: 70%; font-style:normal;
               margin: 0; padding: 0; position: absolute; right: 1%;
               text-align: right;}
    pre        { font-family: times new roman; font-size: 100%; margin-left: 10%;}

    table      {margin-left: 10%;}

a:link {color:blue;
		text-decoration:none}
link {color:blue;
		text-decoration:none}
a:visited {color:blue;
		text-decoration:none}
a:hover {color:red}

</style>
  </head>
  <body>
<pre xml:space="preserve">

The Project Gutenberg EBook of Moby Dick; or The Whale, by Herman Melville

This eBook is for the use of anyone anywh
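One optional robustness step before parsing: requests will happily return an error page as text, so the standard raise_for_status method can be called to fail loudly if the download did not succeed.

# Raise requests.HTTPError if the server returned an error status
r.raise_for_status()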
In [3]:
# Create a BeautifulSoup object from the HTML
html_soup = BeautifulSoup(html, "html.parser")

# Get the text out of the soup
moby_text = html_soup.get_text()

# Create a regex tokenizer that matches runs of word characters
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Tokenize the text
tokens = tokenizer.tokenize(moby_text)

# Create a list called words containing all tokens transformed to lowercase
words = [token.lower() for token in tokens]

# Print out the first eight words
words[:8]
Out[3]:
['moby', 'dick', 'or', 'the', 'whale', 'by', 'herman', 'melville']
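As a quick illustration of the tokenizer's behaviour, the \w+ pattern keeps runs of letters, digits, and underscores and drops everything else, so punctuation disappears and contractions split at the apostrophe. A throwaway example, not part of the project data:

# Punctuation is dropped; "It's" splits into 'It' and 's'
nltk.tokenize.RegexpTokenizer(r'\w+').tokenize("Call me Ishmael. It's cold!")
# ['Call', 'me', 'Ishmael', 'It', 's', 'cold']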
In [4]:
# Get the English stop words from nltk
stop_words = nltk.corpus.stopwords.words('english')

# Print out the first eight stop words
stop_words[:8]
Out[4]:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all']
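Note that nltk's English stop word list is entirely lowercase, which is why the tokens were lowercased in the previous step; otherwise a capitalized "The" would slip past the filter. A quick sanity check:

# Confirms every stop word is already lowercase
all(word == word.lower() for word in stop_words)  # True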
In [5]:
# Create a list words_no_stop containing all words that are in words but not in stop_words
words_no_stop = [word for word in words if word not in stop_words]

# Print the first five entries of words_no_stop to check that stop words are gone
words_no_stop[:5]
Out[5]:
['moby', 'dick', 'whale', 'herman', 'melville']
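A small design note: stop_words is a plain list, so each membership test in the comprehension scans it linearly. On a novel-length token list it is noticeably faster to convert the list to a set first, since set membership checks run in constant time. An optional optimization that produces the same result:

# Same filtering, but with O(1) membership checks
stop_words_set = set(stop_words)
words_no_stop = [word for word in words if word not in stop_words_set]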
In [6]:
# Initialize a Counter object from our processed list of words
count = Counter(words_no_stop)

# Store ten most common words and their counts as top_ten
top_ten = count.most_common(10)

# Print the top ten words and their counts
print(top_ten)
[('whale', 1246), ('one', 925), ('like', 647), ('upon', 568), ('man', 527), ('ship', 519), ('ahab', 517), ('ye', 473), ('sea', 455), ('old', 452)]
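Before drawing the word cloud, the same counts can be shown as a simple bar chart, which makes the relative frequencies easier to compare. A minimal sketch reusing the matplotlib import from the first cell:

# Unpack the (word, count) pairs and plot them
labels, values = zip(*top_ten)
plt.figure(figsize=(8, 4))
plt.bar(labels, values)
plt.title("Ten most frequent words in Moby Dick")
plt.ylabel("Count")
plt.show()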
In [11]:
# Create Word Cloud from word frequencies
wordcloud = WordCloud(
    width=1000, 
    height=500, 
    background_color="white", 
    colormap="viridis"
).generate_from_frequencies(count)

# Display the Word Cloud
plt.figure(figsize=(8, 4))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # Hide axes
plt.show()
[Figure: word cloud of the most frequent words in Moby Dick]
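If the figure needs to be kept, the wordcloud package can also write the rendered image straight to disk with its to_file method. The filename here is arbitrary:

# Save the rendered word cloud as a PNG file
wordcloud.to_file("moby_dick_wordcloud.png")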